{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compare multiple LLMs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llmstudio import LLM\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "import asyncio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One LLM client per model under comparison.\n",
    "# NOTE(review): the identifier strings look like 'provider/model' -- confirm against llmstudio.\n",
    "claude2 = LLM(\"anthropic/claude-2.1\")\n",
    "gpt4 = LLM(\"openai/gpt-4-1106-preview\")\n",
    "gpt3 = LLM(\"openai/gpt-3.5-turbo\")\n",
    "\n",
    "# List of LLM objects; this is the `llms` argument handed to run_all_tasks in the last cell\n",
    "llms = [claude2, gpt4, gpt3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to run a single task against all models and calculate metrics\n",
    "async def run_task(task, expected_output, llms):\n",
    "    \"\"\"Send one task to every model concurrently and collect per-model metrics.\n",
    "\n",
    "    Args:\n",
    "        task: Prompt passed to each model's ``async_chat``.\n",
    "        expected_output: Reference answer that each model's ``chat_output`` is\n",
    "            scored against with ``calculate_similarity``.\n",
    "        llms: Sequence of LLM clients exposing ``model`` and ``async_chat``.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping ``llm.model`` to a dict of metrics for this single call.\n",
    "        The keys carry an ``average_`` prefix because ``run_all_tasks`` sums\n",
    "        and averages them under the same names.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: If a model call failed; the original exception is\n",
    "            chained as the cause.\n",
    "    \"\"\"\n",
    "    # return_exceptions=True lets every call finish before failures are inspected\n",
    "    responses = await asyncio.gather(\n",
    "        *(llm.async_chat(task) for llm in llms), return_exceptions=True\n",
    "    )\n",
    "\n",
    "    # Dictionary to hold metrics by model\n",
    "    metrics_by_model = {}\n",
    "\n",
    "    for llm, response in zip(llms, responses):\n",
    "        # gather() hands back the exception object in place of a result. Indexing it\n",
    "        # would raise an unrelated TypeError and hide which model failed and why.\n",
    "        if isinstance(response, Exception):\n",
    "            raise RuntimeError(f'Model {llm.model!r} failed: {response!r}') from response\n",
    "        if isinstance(response, BaseException):\n",
    "            # e.g. asyncio.CancelledError: propagate unchanged\n",
    "            raise response\n",
    "\n",
    "        call_metrics = response['metrics']\n",
    "        metrics_by_model[llm.model] = {\n",
    "            'average_latency': call_metrics['latency'],\n",
    "            'average_cost': call_metrics['cost'],\n",
    "            'average_output_token': call_metrics['output_tokens'],\n",
    "            'average_similarity': calculate_similarity(response['chat_output'], expected_output),\n",
    "            'average_time_to_first_token': call_metrics['time_to_first_token'],\n",
    "            'average_inter_token_latency': call_metrics['inter_token_latency'],\n",
    "            'average_tokens_per_second': call_metrics['tokens_per_second']\n",
    "        }\n",
    "\n",
    "    return metrics_by_model\n",
    "\n",
    "# Main function to run all tasks\n",
    "async def run_all_tasks(tasks, expected_outputs, llms):\n",
    "    \"\"\"Run every task against every model and average the metrics per model.\n",
    "\n",
    "    Tasks are awaited one after another; ``run_task`` is responsible for\n",
    "    querying the models of a single task.\n",
    "\n",
    "    Args:\n",
    "        tasks: Iterable of prompts.\n",
    "        expected_outputs: Iterable of reference answers, paired with ``tasks``\n",
    "            by position.\n",
    "        llms: Sequence of LLM clients forwarded to ``run_task``.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping model name to a dict of metric averages over the tasks\n",
    "        that were actually run for that model. Empty when no task ran.\n",
    "    \"\"\"\n",
    "    totals_by_model = {}\n",
    "    runs_by_model = {}\n",
    "    for task, expected_output in zip(tasks, expected_outputs):\n",
    "        task_metrics_by_model = await run_task(task, expected_output, llms)\n",
    "        for model, metrics in task_metrics_by_model.items():\n",
    "            # Accumulate into a dict owned by this function so the dict returned\n",
    "            # by run_task is never mutated.\n",
    "            totals = totals_by_model.setdefault(model, {})\n",
    "            for key, value in metrics.items():\n",
    "                totals[key] = totals.get(key, 0) + value\n",
    "            runs_by_model[model] = runs_by_model.get(model, 0) + 1\n",
    "\n",
    "    # Divide by the number of runs actually aggregated rather than len(tasks):\n",
    "    # zip() stops at the shorter input, so len(tasks) could overstate the count\n",
    "    # and skew every average (and it would reject unsized iterables).\n",
    "    return {\n",
    "        model: {key: total / runs_by_model[model] for key, total in totals.items()}\n",
    "        for model, totals in totals_by_model.items()\n",
    "    }\n",
    "\n",
    "# Function to calculate similarity\n",
    "# Cache of SentenceTransformer instances keyed by model name\n",
    "_SIMILARITY_MODELS = {}\n",
    "\n",
    "\n",
    "def calculate_similarity(model_output, expected_output, model_name=\"paraphrase-MiniLM-L6-v2\"):\n",
    "    \"\"\"Return the cosine similarity between the embeddings of two texts.\n",
    "\n",
    "    Args:\n",
    "        model_output: Text produced by the model (assumed to be a single string).\n",
    "        expected_output: Reference text (assumed to be a single string).\n",
    "        model_name: Name of the SentenceTransformer model used for embedding.\n",
    "\n",
    "    Returns:\n",
    "        The similarity score extracted from the result tensor via ``.item()``.\n",
    "    \"\"\"\n",
    "    # Constructing a SentenceTransformer on every call repeats the same setup for\n",
    "    # each model and task; build it once per model name and reuse it.\n",
    "    model = _SIMILARITY_MODELS.get(model_name)\n",
    "    if model is None:\n",
    "        model = _SIMILARITY_MODELS[model_name] = SentenceTransformer(model_name)\n",
    "\n",
    "    embedding1 = model.encode(model_output, convert_to_tensor=True)\n",
    "    embedding2 = model.encode(expected_output, convert_to_tensor=True)\n",
    "    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)\n",
    "    return cosine_scores.item()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example usage\n",
    "# Each task is paired by position with the entry at the same index in expected_outputs.\n",
    "tasks = [\n",
    "    \"\"\"\n",
    "    Task: What is the insurance final value to be refunded? Think step by step, and your response should only be the refunded value in the following format 'Refund:$x', where x is the amount to be refunded.\n",
    "\n",
    "    Q: I purchased my ticket for $200 but I was charged an extra 15% due to some insurance. Out of those 15%, 80% was insurance against baggage lost which I do want to keep. I want a refund on the insurance part that I do not want.\n",
    "\n",
    "    A: Let's think step by step\n",
    "    \"\"\"\n",
    "]\n",
    "\n",
    "# Worked answer: insurance is 15% of $200 = $30; 80% of it is kept, so the refundable 20% is $6.\n",
    "expected_outputs = [\"Refund:$6\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Top-level await: run the benchmark; the per-model averages returned by\n",
    "# run_all_tasks are shown as this cell's output.\n",
    "await run_all_tasks(tasks, expected_outputs, llms)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
